{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Process Utilization Files and Generate Pickle Files for Plotting"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Note that running the following scripts is time-consuming and should be done only once. The pickle files are provided in the `util_pkl` directory."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pytz\n",
    "import pickle\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "\n",
    "from glob import glob\n",
    "from pathlib import Path\n",
    "from utils import cluster_metric_header, dcgm_metric_header\n",
    "\n",
    "\"\"\" cluster_metric_header\n",
    "0: \"CPU\",\n",
    "1: \"MEMORY\",\n",
    "2: \"IB_SEND\",\n",
    "3: \"RECEIVE\",\n",
    "\"\"\"\n",
    "\"\"\" dcgm_metric_header\n",
    "0: XID_ERRORS\n",
    "1: GPU_TEMP\n",
    "2: MEMORY_TEMP\n",
    "3: MEM_CLOCK\n",
    "4: MEM_COPY_UTIL\n",
    "5: FB_FREE\n",
    "6: FB_USED\n",
    "7: DRAM_ACTIVE\n",
    "8: POWER_USAGE\n",
    "9: GPU_UTIL\n",
    "10: PIPE_TENSOR_ACTIVE\n",
    "11: SM_ACTIVE\n",
    "12: SM_OCCUPANCY\n",
    "\"\"\"\n",
    "\n",
    "SAVEPKL = \"./util_pkl\"\n",
    "\n",
    "\n",
    "def read_csv_with_concat(path=\"./csv\", file_name=None):\n",
    "    file = Path(path, f\"{file_name}.csv\")\n",
    "\n",
    "    if file.exists():\n",
    "        # If original file exists, read it directly\n",
    "        df = pd.read_csv(file)\n",
    "        print(f\"Reading {file_name}\")\n",
    "    else:\n",
    "        # If original file does not exist, read all the split files\n",
    "        split_files = sorted(glob(f\"{path}/{file_name}-2023-*.csv\"))\n",
    "        print(f\"Reading splitted files: {split_files}\")\n",
    "        df = pd.concat([pd.read_csv(split_file) for split_file in split_files])\n",
    "        df.reset_index(drop=True, inplace=True)\n",
    "    return df\n",
    "\n",
    "\n",
     "def read_concat_parse_save_cluster_metrics(path=\"./csv_cpu\", metrics=cluster_metric_header):\n",
     "    \"\"\"Clean each Prometheus cluster metric and re-save it as ./<metric>.csv.\n",
     "\n",
     "    For every metric name in `metrics`: load the (possibly date-split) raw\n",
     "    CSV, drop duplicate timestamps, sort chronologically, convert the\n",
     "    epoch-second 'Time' column to Asia/Shanghai local time and make it the\n",
     "    index, apply per-metric fixups, then write the result to the current\n",
     "    directory.\n",
     "    \"\"\"\n",
     "    for metric in metrics:\n",
     "        data = read_csv_with_concat(path=path, file_name=metric)\n",
     "        data.drop_duplicates(subset=[\"Time\"], inplace=True)\n",
     "        data.sort_values(by=\"Time\", inplace=True)\n",
     "        # Raw 'Time' holds Unix seconds (UTC); convert to cluster-local time.\n",
     "        data[\"Time\"] = pd.to_datetime(data[\"Time\"], unit=\"s\").dt.tz_localize(pytz.utc).dt.tz_convert(\"Asia/Shanghai\")\n",
     "        data.set_index(\"Time\", drop=True, inplace=True)\n",
     "        # Printing both counts reveals duplicate column (server) names, if any.\n",
     "        print(f\"Column Number: {len(list(data.columns))}, {len(set(list(data.columns)))}\")\n",
     "\n",
     "        if \"NODE_MEMORY\" in metric:\n",
     "            # Around 2 hours has some bug (ip has additional '.1', like '10.140.0.131' -> '10.140.0.131.1')\n",
     "            data = data[(data.index < \"2023-07-19 11:35:00\") | (data.index > \"2023-07-19 14:01:00\")]\n",
     "\n",
     "        if \"NODE_CPU\" in metric or \"NODE_MEMORY\" in metric:\n",
     "            data = data * 100  # Ratio (0-1) -> CPU / Memory Utilization (%)\n",
     "\n",
     "        if \"NODE_IB\" in metric:\n",
     "            data.rename(columns=lambda x: x.replace(\"-mlx5_0\", \"\"), inplace=True)  # Simplified, since one IB NIC per server\n",
     "\n",
     "        data.dropna(axis=1, how=\"all\", inplace=True)\n",
     "        data = data.round(3)\n",
     "        data.to_csv(f\"./{metric}.csv\")\n",
    "\n",
    "\n",
     "def read_concat_parse_save_dcgm_metrics(path=\"./csv\", metrics=dcgm_metric_header):\n",
     "    \"\"\"Clean each NVIDIA DCGM GPU metric and re-save it as ./<metric>.csv.\n",
     "\n",
     "    For every metric name in `metrics`: load the (possibly date-split) raw\n",
     "    CSV, drop duplicate timestamps, sort chronologically, convert the\n",
     "    epoch-second 'Time' column to Asia/Shanghai local time and make it the\n",
     "    index, rescale/round per metric type, then write the result to the\n",
     "    current directory.\n",
     "    \"\"\"\n",
     "    for metric in metrics:\n",
     "        data = read_csv_with_concat(path=path, file_name=metric)\n",
     "        data.drop_duplicates(subset=[\"Time\"], inplace=True)\n",
     "        data.sort_values(by=\"Time\", inplace=True)\n",
     "        # Raw 'Time' holds Unix seconds (UTC); convert to cluster-local time.\n",
     "        data[\"Time\"] = pd.to_datetime(data[\"Time\"], unit=\"s\").dt.tz_localize(pytz.utc).dt.tz_convert(\"Asia/Shanghai\")\n",
     "        data.set_index(\"Time\", drop=True, inplace=True)\n",
     "        # Printing both counts reveals duplicate column (GPU) names, if any.\n",
     "        print(f\"Column Number: {len(list(data.columns))}, {len(set(list(data.columns)))}\")\n",
     "\n",
     "        # if \"XID\" in metric or \"TEMP\" in metric or \"CLOC\" in metric:\n",
     "        #     data = data.astype(int, errors='ignore')\n",
     "\n",
     "        if \"ACTIVE\" in metric or \"OCCUPANCY\" in metric:\n",
     "            data = data * 100  # These metrics are 0-1 ratios -> percentage\n",
     "            data = data.round(3)\n",
     "\n",
     "        if \"POWER\" in metric:\n",
     "            data = data.round(1)  # Coarser rounding for power readings\n",
     "\n",
     "        data.dropna(axis=0, how=\"all\", inplace=True)\n",
     "        data.dropna(axis=1, how=\"all\", inplace=True)\n",
     "        data.to_csv(f\"./{metric}.csv\")\n",
    "\n",
    "\n",
    "def calculate_sum_cdf_axis100(df, dot_num=1000):\n",
    "    \"\"\"\n",
    "    Calculate quantity percentile CDF, y-axis: 0-100%,\n",
    "    \"\"\"\n",
    "    print(\"Parsing\")\n",
    "    data = df.melt(id_vars=\"Time\", var_name=\"Server\")\n",
    "    data.dropna(subset=[\"value\"], inplace=True)\n",
    "\n",
    "    y = np.linspace(0, 1, num=dot_num)\n",
    "    x = data[\"value\"].quantile(y).values\n",
    "    y = y * 100\n",
    "    return x, y\n",
    "\n",
    "\n",
    "def calculate_num_cdf_axis100(df, dot_num=1000):\n",
    "    \"\"\"\n",
    "    Calculate quantity percentile CDF, y-axis: 0-100%,\n",
    "    \"\"\"\n",
    "    print(\"Parsing\")\n",
    "    data = df.melt(id_vars=\"Time\", var_name=\"Server\")\n",
    "    data.dropna(subset=[\"value\"], inplace=True)\n",
    "    # data.sort_values('value', ascending=True, inplace=True)\n",
    "    # data.reset_index(drop=True, inplace=True)\n",
    "\n",
    "    y = np.linspace(0, 1, num=dot_num)\n",
    "    x = data[\"value\"].quantile(y).values\n",
    "    y = y * 100\n",
    "    return x, y"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Example 1: Prometheus Metrics (e.g., CPU and Memory Utilization)\n",
    "\n",
    "You can change to any metric you want to plot by changing the `file_name` variable in the following script."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
     "# Build CPU and memory utilization CDFs for the Seren cluster and pickle\n",
     "# them for the plotting notebooks.\n",
     "data_cpu = read_csv_with_concat(path=\"./seren\", file_name=cluster_metric_header[0])\n",
     "data_mem = read_csv_with_concat(path=\"./seren\", file_name=cluster_metric_header[1])\n",
     "x1, y1 = calculate_num_cdf_axis100(data_cpu)\n",
     "x2, y2 = calculate_num_cdf_axis100(data_mem)\n",
     "# NOTE(review): .split(\":\") assumes 'Time' is read back as a string like\n",
     "# 'YYYY-MM-DD HH:MM:SS' (everything before the first colon = date + hour)\n",
     "# -- confirm against the released CSVs before re-running.\n",
     "print(\n",
     "    f'CPU Period: (Start) {data_cpu.at[0, \"Time\"].split(\":\")[0]}h (End) {data_cpu.at[len(data_cpu)-1, \"Time\"].split(\":\")[0]}h'\n",
     ")\n",
     "print(\n",
     "    f'MEM Period: (Start) {data_mem.at[0, \"Time\"].split(\":\")[0]}h (End) {data_mem.at[len(data_mem)-1, \"Time\"].split(\":\")[0]}h'\n",
     ")\n",
     "\n",
     "with open(f\"{SAVEPKL}/util_cpu_mem_seren.pkl\", \"wb\") as file:\n",
     "    pickle.dump([x1, y1, x2, y2], file)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Example 2: NVIDIA DCGM Metrics (e.g., GPU and GPU Memory Utilization)\n",
    "\n",
    "You can change to any metric you want to plot by changing the `file_name` variable in the following script."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
     "# Build GPU compute utilization and GPU memory (framebuffer) usage CDFs.\n",
     "data_gpu_util = read_csv_with_concat(path=\"./seren\", file_name=dcgm_metric_header[9])  # \"DCGM_FI_DEV_GPU_UTIL\"\n",
     "data_gpu_mem = read_csv_with_concat(path=\"./seren\", file_name=dcgm_metric_header[6])  # \"DCGM_FI_DEV_FB_USED\"\n",
     "# Convert FB_USED to a utilization percentage. The 80 * 1024 divisor assumes\n",
     "# FB_USED is in MiB on 80 GB GPUs -- TODO confirm for this cluster's hardware.\n",
     "data_gpu_mem.iloc[:, 1:] = 100 * data_gpu_mem.iloc[:, 1:] / (80 * 1024)\n",
     "x1, y1 = calculate_num_cdf_axis100(data_gpu_util)\n",
     "x2, y2 = calculate_num_cdf_axis100(data_gpu_mem)\n",
     "\n",
     "with open(f\"{SAVEPKL}/util_gpu_util_mem_seren.pkl\", \"wb\") as file:\n",
     "    pickle.dump([x1, y1, x2, y2], file)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Processing IPMI Power Files"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_AB = pd.read_csv(\"./ipmi/GPU_AB_Power.csv\", parse_dates=[\"Time\"])\n",
    "df_C = pd.read_csv(\"./ipmi/GPU_C_Power.csv\", parse_dates=[\"Time\"])\n",
    "df_D = pd.read_csv(\"./ipmi/CPU_D_Power.csv\", parse_dates=[\"Time\"])\n",
    "\n",
    "df_A = df_AB.dropna()\n",
    "df_B = df_AB[df_AB.isna().any(axis=1)]  # Type B without MEM_Power record\n",
    "\n",
    "dfs = {\"GPU_A\": df_A, \"GPU_B\": df_B, \"GPU_C\": df_C, \"CPU_D\": df_D}\n",
    "\n",
    "# Extract sys_total_power\n",
    "df_A_power = df_A[[\"Time\", \"Sys_Total_Power\"]]\n",
    "df_B_power = df_B[[\"Time\", \"Sys_Total_Power\"]]\n",
    "df_C_power = df_C[[\"Time\", \"Sys_Total_Power\"]]\n",
    "df_gpu = pd.concat([df_A_power, df_B_power, df_C_power])\n",
    "\n",
    "x1, y1 = calculate_sum_cdf_axis100(df_gpu)\n",
    "x2, y2 = calculate_sum_cdf_axis100(df_D[[\"Time\", \"Sys_Total_Power\"]])\n",
    "\n",
    "with open(f\"./server_power.pkl\", \"wb\") as file:\n",
    "    pickle.dump([x1, y1, x2, y2], file)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Processing Philly GPU Utilization Data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 45,
   "metadata": {},
   "outputs": [],
   "source": [
     "# The released Philly trace CSV has ragged rows; on_bad_lines=\"skip\" drops\n",
     "# the unparsable ones. The remaining data appears shifted one column left\n",
     "# relative to the header, so the steps below realign it -- confirm against\n",
     "# the raw file if re-deriving.\n",
     "philly = pd.read_csv(\n",
     "    \"./philly/philly_gpu_util.csv\", on_bad_lines=\"skip\", header=0\n",
     ")  # Please refer to their official repo for the data\n",
     "cols = list(philly.columns)\n",
     "philly = philly.drop(columns=[cols[-1]])  # drop the spill-over last column\n",
     "philly.reset_index(inplace=True)  # old index becomes the first column\n",
     "philly.columns = cols  # reapply the original header, now shifted by one\n",
     "philly.rename(columns={\"time\": \"Time\"}, inplace=True)\n",
     "philly = philly.drop(columns=[cols[1]])  # drop the now-redundant column\n",
     "\n",
     "x1, y1 = calculate_num_cdf_axis100(philly)\n",
     "with open(f\"{SAVEPKL}/util_gpu_util_philly.pkl\", \"wb\") as file:\n",
     "    pickle.dump([x1, y1], file)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "base",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.16"
  },
  "orig_nbformat": 4
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
